欢迎关注“小丫画图”公众号,回复“小白”,看小视频,实现点鼠标跑代码。

小丫微信: epigenomics E-mail:

作者:大鱼海棠

单位:中国药科大学国家天然药物重点实验室,生物统计与计算药学研究中心

小丫编辑校验

需求描述

过配对差异表达,画亚型特异性上(下)调基因的对角热图。比如样本分了4-5个组,分析并画出每个组里面上调或下调的热图。

Figure 2: A) Unsupervised clustering of lncRNAs identified 4 clusters: cluster I (related to the basal-like breast cancer subtype), cluster II (related to the HER-2 enriched subtype), cluster III (related to luminal A subtype), and cluster IV (related to luminal A and B subtypes). Correlation with PAM50 classification, estrogen receptor (ER), progesterone receptor (PR) and HER2 status are depicted.

出自https://www.ncbi.nlm.nih.gov/pubmed/25296969

应用场景

在大于等于3组亚型的基础上,得到配对差异表达基因后,计算每个亚型特异性上/下调的基因集,并绘制特异性基因表达热图。

环境设置

使用国内镜像安装包

options("repos"= c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
options(BioC_mirror="http://mirrors.ustc.edu.cn/bioc/")

加载包

library(ClassDiscovery) # 距离测量
## Loading required package: cluster
## Loading required package: oompaBase
library(edgeR) # 差异表达
## Loading required package: limma
library(NMF) # 绘制热图
## Loading required package: pkgmaker
## Loading required package: registry
## Loading required package: rngtools
## NMF - BioConductor layer [OK] | Shared memory capabilities [NO: bigmemory] | Cores 3/4
##   To enable shared memory capabilities, try: install.extras('
## NMF
## ')
library(gplots) # 热图颜色
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:oompaBase':
## 
##     redgreen
## The following object is masked from 'package:stats':
## 
##     lowess
library(RColorBrewer) # 热图颜色

Sys.setenv(LANGUAGE = "en") #显示英文报错信息
options(stringsAsFactors = FALSE) #禁止chr转成factor

自定义配对比较函数

这里用edgeR,如果用DESeq2或limma,就把FigureYa118MulticlassDESeq2或FigureYa119MulticlassLimma里面相应的函数替换过来,可无缝对接。

# 创建需要配对比较的列表
createList <- function(group=NULL) {
  
  tumorsam <- names(group)
  sampleList = list()
  treatsamList =list()
  treatnameList <- c()
  ctrlnameList <- c()
  
  #A-1: 类1 vs 其他
  sampleList[[1]] = tumorsam
  treatsamList[[1]] = intersect(tumorsam, names(group[group=="Cluster1"])) # 亚型名称需要根据情况修改
  treatnameList[1] <- "Cluster1" # 该亚型的命名
  ctrlnameList[1] <- "Others" # 其他亚型的命名
  
  #A-2: 类2 vs 其他
  sampleList[[2]] = tumorsam
  treatsamList[[2]] = intersect(tumorsam, names(group[group=="Cluster2"]))
  treatnameList[2] <- "Cluster2"
  ctrlnameList[2] <- "Others"
  
  #A-3: 类3 vs 其他
  sampleList[[3]] = tumorsam
  treatsamList[[3]] = intersect(tumorsam, names(group[group=="Cluster3"]))
  treatnameList[3] <- "Cluster3"
  ctrlnameList[3] <- "Others"
  
  #A-4: 类4 vs 其他
  sampleList[[4]] = tumorsam
  treatsamList[[4]] = intersect(tumorsam, names(group[group=="Cluster4"]))
  treatnameList[4] <- "Cluster4"
  ctrlnameList[4] <- "Others"
  
  return(list(sampleList, treatsamList, treatnameList, ctrlnameList))
  
}

# 配对edgeR
twoclassedgeR <- function(res.path=NULL, countsTable=NULL, prefix=NULL, complist=NULL, overwt=FALSE) {
  
  #Groupinfo could contain "batch", which will be considered by edgeR design matrix
  sampleList <- complist[[1]]
  treatsamList <- complist[[2]]
  treatnameList <- complist[[3]]
  ctrlnameList <- complist[[4]]
  allsamples <- colnames(countsTable)
  
  options(warn=1)
  for (k in 1:length(sampleList)) { # 循环读取每一次比较的内容
    samples <- sampleList[[k]]
    treatsam <- treatsamList[[k]]
    treatname <- treatnameList[k]
    ctrlname <- ctrlnameList[k]
    
    compname <- paste(treatname, "_vs_", ctrlname, sep="") # 生成最终文件名
    tmp = rep("others", times=length(allsamples))
    names(tmp) <- allsamples
    tmp[samples]="control"
    tmp[treatsam]="treatment"
    outfile <- file.path( res.path, paste(prefix, "_edgeR_test_result.", compname, ".txt", sep="") )
    if (file.exists(outfile) & (overwt==FALSE)) { # 因此差异表达分析较慢,因此如果文件存在,在不覆盖的情况下(overwt=F)不再次计算差异表达
      cat(k, ":", compname, "exists and skipped;\n")
      next
    }
    
    saminfo <- data.frame("Type"=tmp[samples],"SampleID"=samples,stringsAsFactors = F)
    
    group=factor(saminfo$Type,levels = c("control","treatment"))    
    
    design <- model.matrix(~group) # 设计矩阵仅包含亚型信息,若有批次效应请修改,例如design <- model.matrix(~group+treat)
    rownames(design) <- samples
    
    # 差异表达过程,具体参数细节及输出结果解释,请参阅相关document
    y <- DGEList(counts=countsTable[,samples],group=saminfo$Type)
    y <- calcNormFactors(y)
    y <- estimateDisp(y, design, robust=TRUE)
    fit <- glmFit(y, design)
    lrt <- glmLRT(fit)
    ordered_tags <- topTags(lrt, n=100000)
    allDiff=ordered_tags$table
    allDiff=allDiff[is.na(allDiff$FDR)==FALSE,]
    diff=allDiff
    
    diff$id <- rownames(diff)
    res <- diff[,c("id","logFC","logCPM","LR","PValue","FDR")]
    colnames(res) <- c("id","log2FC","logCPM","LR","PValue","FDR")
    write.table(res, file=outfile, row.names=F, col.names=T, sep="\t", quote=F)
    cat(k, ",")
  }
  options(warn=0)
}

自定义画图函数

# 特异性基因计算和热图绘制
sigheat <- function(featdata=NULL, # 表达谱,注意一般是标准化后的FPKM或者TPM,此时norm选择none
                    DEfiles=NULL, # 差异表达结果文件名
                    outfile=NULL, # 输出文件名
                    hcs=NULL, # 聚类树结构
                    heatCol=greenred(128), #热图配色
                    # 如果你喜欢橙黄蓝,可以换成下面这行
                    #heatCol=colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100),
                    annCol=NULL, # 样本注释信息
                    annColors=NULL, # 注释信息的颜色
                    res.path=NULL, # 存储特异性表达的基因集的路径
                    fig.path=NULL, # 存储热图的路径
                    halfwidth=3, # 表达谱标准化后的上下限cutoff
                    padjcut=0.05, # fdr的阈值
                    log2fccut=1, # log2fc的阈值,默认为1
                    height=8, # 热图图片高度
                    width=8, # 热图的宽度
                    norm="none", # 输入数据是否需要标准化,默认不需要,可选择quantile,或者median标准化
                    dirct="UP", # 特异性基因的表达方向,默认为上调(推荐),可选择UP或DOWN
                    fontsize=8, # 热图的字号
                    labRow = F, # 热图是否显示基因名
                    labCol = F) {  # 热图是否显示样本名
  
  samabsFlag <- FALSE 
  if (is.null(hcs)) {samabsFlag <- TRUE}
  
  if(is.null(DEfiles)) {stop("DEfiles is NULL!")}
  if (!is.element(norm, c("none", "median", "quantile"))) {stop( "norm type error!") }
  if (!is.element(dirct, c("UP", "DOWN"))) {stop( "dirct type error!") }
  
  if(dirct=="UP") { outlabel <- "uniquely_significantly_overexpressed.txt" }
  if(dirct=="DOWN") { outlabel <- "uniquely_significantly_underexpressed.txt" }
  
  genelist <- c()
  for (filek in DEfiles) {
    DEres <- read.table(file.path(res.path, filek), header=T, row.names=NULL, sep="\t", quote="", stringsAsFactors=F)    
    DEres <- DEres[!duplicated(DEres[, 1]),]
    DEres <- DEres[!is.na(DEres[, 1]), ]
    rownames(DEres) <- DEres[, 1]
    DEres <- DEres[, -1]
    
    if (dirct=="UP") {
      genelist <- c( genelist, rownames(DEres[!is.na(DEres$FDR) & DEres$FDR < padjcut & !is.na(DEres$log2FC) & DEres$log2FC > log2fccut, ]) )
    }
    if (dirct=="DOWN") {
      genelist <- c( genelist, rownames(DEres[!is.na(DEres$FDR) & DEres$FDR < padjcut & !is.na(DEres$log2FC) & DEres$log2FC < -log2fccut, ]) )
    }
  }
  unqlist <- setdiff(genelist,genelist[duplicated(genelist)])
  
  for (filek in DEfiles) {
    DEres <- read.table(file.path(res.path, filek), header=T, row.names=NULL, sep="\t", quote="", stringsAsFactors=F)
    DEres <- DEres[!duplicated(DEres[, 1]),]
    DEres <- DEres[!is.na(DEres[, 1]), ]
    rownames(DEres) <- DEres[, 1]
    DEres <- DEres[, -1]
    
    if(dirct=="UP") {
      outk <- intersect( unqlist, rownames(DEres[!is.na(DEres$FDR) & DEres$FDR < padjcut & !is.na(DEres$log2FC) & DEres$log2FC > log2fccut, ]) )
    }
    if(dirct=="DOWN") {
      outk <- intersect( unqlist, rownames(DEres[!is.na(DEres$FDR) & DEres$FDR < padjcut & !is.na(DEres$log2FC) & DEres$log2FC < -log2fccut, ]) )
    }
    write.table(outk, file=file.path(res.path, paste(filek, outlabel, sep="_")), row.names=F, col.names=F, sep="\t", quote=F)
  }
  
  if (sum(!is.element(unqlist, rownames(featdata)))>0) {stop("unqlist not found in featdata!")}
  
  samples <- colnames(featdata)
  if (norm=="quantile") {
    tmp <- normalize.quantiles(as.matrix(featdata))
    rownames(tmp) <- rownames(featdata)
    colnames(tmp) <- colnames(featdata)
  }
  if (norm=="median") {
    tmp <- sweep(featdata,2, apply(featdata,2,median,na.rm=T))
  }
  if(norm=="none") {
    tmp <- featdata
  }
  
  if (samabsFlag) {
    subdata <- tmp[unqlist, intersect(samples, colnames(tmp))]
  }else{
    subdata <- tmp[unqlist, samples]
  }
  
  hcg <- hclust(distanceMatrix(as.matrix(t(subdata)), "pearson"), "ward.D") 
  plotdata <- t(scale(t(subdata)))
  plotdata[plotdata > halfwidth] <- halfwidth
  plotdata[plotdata < - halfwidth] <- -halfwidth
  
  pdf(file=file.path(fig.path, outfile), height=height, width=width)
  Rowv <- NA
  if (samabsFlag) {
    if (ncol(annCol)<2) {
      tmp <- as.data.frame(annCol[colnames(plotdata), ])
      rownames(tmp) <- colnames(plotdata)
      colnames(tmp) <- colnames(annCol)
    }else{
      tmp <- annCol[colnames(plotdata), ]
    }
    if(isTRUE(labRow) & isTRUE(labCol)){
      aheatmap(plotdata, Rowv=Rowv, Colv=NA, annCol=tmp, annColors=annColors, color=heatCol, revC=TRUE, breaks=0, fontsize=fontsize)
    }
    if(isTRUE(labRow) & isFALSE(labCol)){
      aheatmap(plotdata, Rowv=Rowv, Colv=NA, annCol=tmp, annColors=annColors, color=heatCol, revC=TRUE, breaks=0, fontsize=fontsize, labCol = NA)
    }
    if(isFALSE(labRow) & isTRUE(labCol)){
      aheatmap(plotdata, Rowv=Rowv, Colv=NA, annCol=tmp, annColors=annColors, color=heatCol, revC=TRUE, breaks=0, fontsize=fontsize, labRow = NA)
    } 
    if(isFALSE(labRow) & isFALSE(labCol)){
      aheatmap(plotdata, Rowv=Rowv, Colv=NA, annCol=tmp, annColors=annColors, color=heatCol, revC=TRUE, breaks=0, fontsize=fontsize, labCol = NA, labRow = NA)
    }
    
  }else{
    if (ncol(annCol)<2) {
      tmp <- as.data.frame(annCol[samples, ])
      rownames(tmp) <- samples
      colnames(tmp) <- colnames(annCol)
    }else{
      tmp <- annCol[samples, ]
    }
    if(sum(hcs$labels!=colnames(plotdata))>0) {stop("colnames mismatch for aheatmap!")}
    if(isTRUE(labRow) & isTRUE(labCol)){
      aheatmap(plotdata, Rowv=Rowv, Colv=as.dendrogram(hcs), annCol=tmp, annColors=annColors, color=heatCol, revC=TRUE, breaks=0, fontsize=fontsize)
    }
    if(isTRUE(labRow) & isFALSE(labCol)){
      aheatmap(plotdata, Rowv=Rowv, Colv=as.dendrogram(hcs), annCol=tmp, annColors=annColors, color=heatCol, revC=TRUE, breaks=0, fontsize=fontsize, labCol = NA)
    }
    if(isFALSE(labRow) & isTRUE(labCol)){
      aheatmap(plotdata, Rowv=Rowv, Colv=as.dendrogram(hcs), annCol=tmp, annColors=annColors, color=heatCol, revC=TRUE, breaks=0, fontsize=fontsize, labRow = NA)
    } 
    if(isFALSE(labRow) & isFALSE(labCol)){
      aheatmap(plotdata, Rowv=Rowv, Colv=as.dendrogram(hcs), annCol=tmp, annColors=annColors, color=heatCol, revC=TRUE, breaks=0, fontsize=fontsize, labCol = NA, labRow = NA)
    }
  }
  invisible(dev.off())
  
  # return(unqlist)
}

输入文件

easy_input_counts.txt,easy_input_FPKM.txt,表达矩阵,分别是read count和FPKM。这里以read count为输入,用edgeR做配对比较,用FPKM画图。

# 设置文件路径
workdir <- "."

# 读取SKCM表达谱(count和对应的FPKM,或者TPM文件)
countstable <- read.table("easy_input_counts.txt",sep = "\t",row.names = 1,header = T,check.names = F,stringsAsFactors = F)

# FPKM与counts对应。如果你要把count转换为FPKM,可参考FigureYa34count2FPKM
FPKM <- read.table("easy_input_FPKM.txt",sep = "\t",row.names = 1,header = T,check.names = F,stringsAsFactors = F) #

# 如果你只有FPKM数据,打算用FigureYa119Multiclasslimma做配对比较,则只需提供easy_input_FPKM.txt
# 运行下面这行,不运行本段第一行
#countstable <- FPKM

all(colnames(countstable) == colnames(FPKM)) # 检查一下样本名
## [1] TRUE
all(rownames(countstable) == rownames(FPKM)) # 检查一下基因名
## [1] TRUE

聚类和配对差异表达分析

# 样本聚类 #
input <- log2(FPKM + 1) # 数据对数化
hcs <- hclust(distanceMatrix(as.matrix(input), "pearson"), "ward.D") # 样本聚类,更多聚类请参阅FigureYa91cluster_heatmap

group <- cutree(hcs, k = 4) # 样本分4类(实际上大于等于3类就可以)
group <- paste0("Cluster",as.numeric(group))
names(group) <- colnames(countstable)

#--------------#
# 配对差异表达 #
# 注意:该处使用的是FigureYa120MulticlassedgeR,若需使用其他差异表达算法请参阅FigureYa118MulticlassDESeq2,FigureYa119Multiclasslimma
complist <- createList(group=group)
twoclassedgeR(res.path = ".", #所有配对差异表达结果都会输出在res.path路径下
              countsTable = countstable,
              prefix = "SKCM", #文件名以SKCM开头
              complist = complist,
              overwt = F)
## 1 : Cluster1_vs_Others exists and skipped;
## 2 : Cluster2_vs_Others exists and skipped;
## 3 : Cluster3_vs_Others exists and skipped;
## 4 : Cluster4_vs_Others exists and skipped;

开始画图

#--------------------------------------#
# 创建样本注释和注释颜色以显示在热图上 #
annCol <- data.frame(Group = group,
                     row.names = names(group),
                     stringsAsFactors = F)
annColors <- list(Group = c("Cluster1"="yellow",
                            "Cluster2"="blue",
                            "Cluster3"="green",
                            "Cluster4"="red")) #如果你有更多类,就依此规律继续添加

#----------------------------#
# 计算特异性上调基因并画热图 #
sigheat(featdata = log2(FPKM + 1), # 数据已标准化
        # 注意:DEfiles的顺序是要进行调整的,当第一次输出热图后观察对应亚型色块所在的位置调整文件读入顺序
        DEfiles = c("SKCM_edgeR_test_result.Cluster4_vs_Others.txt",
                    "SKCM_edgeR_test_result.Cluster1_vs_Others.txt",
                    "SKCM_edgeR_test_result.Cluster2_vs_Others.txt",
                    "SKCM_edgeR_test_result.Cluster3_vs_Others.txt"),
        hcs = hcs, # 样本聚类信息
        annCol = annCol[colnames(FPKM),,drop = F], # 样本注释
        annColors = annColors, # 样本注释颜色
        res.path = workdir,
        fig.path = workdir,
        outfile = "skcm_sigheatmap_upexpressed.pdf",
        halfwidth = 2,
        padjcut = 0.05,
        log2fccut = 1,
        norm = "none", # 不标准化
        dirct = "UP", # 上调
        width = 8,
        height = 8)

#----------------------------#
# 计算特异性下调基因并画热图 #
sigheat(featdata = log2(FPKM + 1), # 数据已标准化
        # 注意:DEfiles的顺序是要进行调整的,当第一次输出热图后观察对应亚型色块所在的位置调整文件读入顺序
        DEfiles = c("SKCM_edgeR_test_result.Cluster4_vs_Others.txt",
                    "SKCM_edgeR_test_result.Cluster1_vs_Others.txt",
                    "SKCM_edgeR_test_result.Cluster2_vs_Others.txt",
                    "SKCM_edgeR_test_result.Cluster3_vs_Others.txt"),
        hcs = hcs, # 样本聚类信息
        annCol = annCol[colnames(FPKM),,drop = F], # 样本注释
        annColors = annColors, # 样本注释颜色
        res.path = workdir,
        fig.path = workdir,
        outfile = "skcm_sigheatmap_downexpressed.pdf",
        halfwidth = 2,
        padjcut = 0.05,
        log2fccut = 1,
        norm = "none", # 不标准化
        dirct = "DOWN", # 下调(不推荐)
        width = 8,
        height = 8)

Session Info

sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## Random number generation:
##  RNG:     Mersenne-Twister 
##  Normal:  Inversion 
##  Sample:  Rounding 
##  
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] RColorBrewer_1.1-2    gplots_3.1.0          NMF_0.23.0           
##  [4] Biobase_2.48.0        BiocGenerics_0.34.0   rngtools_1.5         
##  [7] pkgmaker_0.31.1       registry_0.5-1        edgeR_3.30.3         
## [10] limma_3.44.3          ClassDiscovery_3.3.12 oompaBase_3.2.9      
## [13] cluster_2.1.0        
## 
## loaded via a namespace (and not attached):
##  [1] gtools_3.8.2       locfit_1.5-9.4     tidyselect_1.1.0   xfun_0.17         
##  [5] reshape2_1.4.4     purrr_0.3.4        lattice_0.20-41    colorspace_1.4-1  
##  [9] vctrs_0.3.4        generics_0.0.2     oompaData_3.1.1    htmltools_0.5.0   
## [13] yaml_2.2.1         rlang_0.4.7        pillar_1.4.6       glue_1.4.2        
## [17] withr_2.2.0        plyr_1.8.6         foreach_1.5.0      lifecycle_0.2.0   
## [21] stringr_1.4.0      munsell_0.5.0      gtable_0.3.0       caTools_1.18.0    
## [25] codetools_0.2-16   evaluate_0.14      knitr_1.29         doParallel_1.0.15 
## [29] Rcpp_1.0.5         KernSmooth_2.23-17 xtable_1.8-4       scales_1.1.1      
## [33] ggplot2_3.3.2      digest_0.6.25      stringi_1.5.3      dplyr_1.0.2       
## [37] grid_4.0.2         bibtex_0.4.2.3     bitops_1.0-6       tools_4.0.2       
## [41] magrittr_1.5       tibble_3.0.3       crayon_1.3.4       pkgconfig_2.0.3   
## [45] ellipsis_0.3.1     gridBase_0.4-7     assertthat_0.2.1   rmarkdown_2.3     
## [49] iterators_1.0.12   R6_2.4.1           mclust_5.4.6       compiler_4.0.2